# Capstone Final Group Project

## *Sofiya Ibrayeva, Elizaveta Titova*

Loading the dataset

zillow <- read.csv("cleaned_zillow.csv")
head(zillow)
##   State        City            Street Zipcode Bedroom Bathroom Area      PPSq
## 1    AL    Saraland          Scott Dr   36571       4        2 1614 148.63693
## 2    AL Robertsdale   Cowpen Creek Rd   36567       3        2 1800 144.38889
## 3    AL Gulf Shores Spinnaker Dr #201   36542       2        2 1250 274.00000
## 4    AL     Chelsea        Mallet Way   35043       3        3 2224 150.62950
## 5    AL  Huntsville    Turtlebrook Ct   35811       3        2 1225 204.08163
## 6    AL  Montgomery       Brampton Ln   36117       3        2 1564  96.54731
##   LotArea MarketEstimate RentEstimate Latitude Longitude ListedPrice
## 1  0.3805         240600         1599 30.81953 -88.09596      239900
## 2  3.2000             NA           NA 30.59000 -87.58038      259900
## 3      NA             NA           NA 30.28496 -87.74792      342500
## 4  0.2600         336200         1932 33.35799 -86.60870      335000
## 5      NA         222700         1679 34.77552 -86.44070      250000
## 6  0.2000         150500         1385 32.37275 -86.16512      151000
# checking the missing values
colMeans(is.na(zillow))
##          State           City         Street        Zipcode        Bedroom 
##   0.0000000000   0.0000000000   0.0000000000   0.0000000000   0.0006172567 
##       Bathroom           Area           PPSq        LotArea MarketEstimate 
##   0.0014990521   0.0000000000   0.0000000000   0.0397689696   0.3190335523 
##   RentEstimate       Latitude      Longitude    ListedPrice 
##   0.2634804462   0.0000000000   0.0000000000   0.0000000000
sum(is.na(zillow))
## [1] 14162
# dropping the missing values
Zillow <- na.omit(zillow)
sum(is.na(Zillow))
## [1] 0
# Checking the duplicate rows
duplicated_rows <- duplicated(Zillow)
num_duplicates <- sum(duplicated_rows)
cat("Number of duplicate rows: ", num_duplicates, "\n")
## Number of duplicate rows:  0
# Viewing the data
head(Zillow)
##   State        City        Street Zipcode Bedroom Bathroom Area      PPSq
## 1    AL    Saraland      Scott Dr   36571       4        2 1614 148.63693
## 4    AL     Chelsea    Mallet Way   35043       3        3 2224 150.62950
## 6    AL  Montgomery   Brampton Ln   36117       3        2 1564  96.54731
## 7    AL        Boaz Greenwood Ave   35957       3        2 1717 139.19627
## 8    AL Albertville Lexington Ave   35950       3        2 1674 149.28315
## 9    AL      Mobile  Emerald Dr W   36619       3        3 2190 134.70320
##     LotArea MarketEstimate RentEstimate Latitude Longitude ListedPrice
## 1 0.3805000         240600         1599 30.81953 -88.09596      239900
## 4 0.2600000         336200         1932 33.35799 -86.60870      335000
## 6 0.2000000         150500         1385 32.37275 -86.16512      151000
## 7 0.3800000         238400         2125 34.21001 -86.13669      239000
## 8 0.3443526         248000         1597 34.27540 -86.21792      249900
## 9 0.3443000         294000         1900 30.59507 -88.20307      295000
# Viewing the structure of the Zillow data
str(Zillow)
## 'data.frame':    14853 obs. of  14 variables:
##  $ State         : chr  "AL" "AL" "AL" "AL" ...
##  $ City          : chr  "Saraland" "Chelsea" "Montgomery" "Boaz" ...
##  $ Street        : chr  "Scott Dr" "Mallet Way" "Brampton Ln" "Greenwood Ave" ...
##  $ Zipcode       : num  36571 35043 36117 35957 35950 ...
##  $ Bedroom       : num  4 3 3 3 3 3 3 3 3 3 ...
##  $ Bathroom      : num  2 3 2 2 2 3 2 2 1 2 ...
##  $ Area          : num  1614 2224 1564 1717 1674 ...
##  $ PPSq          : num  148.6 150.6 96.5 139.2 149.3 ...
##  $ LotArea       : num  0.381 0.26 0.2 0.38 0.344 ...
##  $ MarketEstimate: num  240600 336200 150500 238400 248000 ...
##  $ RentEstimate  : num  1599 1932 1385 2125 1597 ...
##  $ Latitude      : num  30.8 33.4 32.4 34.2 34.3 ...
##  $ Longitude     : num  -88.1 -86.6 -86.2 -86.1 -86.2 ...
##  $ ListedPrice   : num  239900 335000 151000 239000 249900 ...
##  - attr(*, "na.action")= 'omit' Named int [1:7828] 2 3 5 10 11 13 14 15 18 20 ...
##   ..- attr(*, "names")= chr [1:7828] "2" "3" "5" "10" ...

As we can see, the Zillow dataset has 14853 obs. of 14 variables. The predicted (dependent) variable is ListedPrice, and the other 13 variables will serve as candidate independent variables in our multiple regression model.

# Getting the summary
summary(Zillow)
##     State               City              Street             Zipcode     
##  Length:14853       Length:14853       Length:14853       Min.   : 1002  
##  Class :character   Class :character   Class :character   1st Qu.:25425  
##  Mode  :character   Mode  :character   Mode  :character   Median :52302  
##                                                           Mean   :50250  
##                                                           3rd Qu.:75704  
##                                                           Max.   :99950  
##     Bedroom          Bathroom           Area            PPSq         
##  Min.   : 0.000   Min.   : 0.000   Min.   :  240   Min.   :   5.883  
##  1st Qu.: 3.000   1st Qu.: 2.000   1st Qu.: 1424   1st Qu.: 134.771  
##  Median : 3.000   Median : 2.000   Median : 1876   Median : 184.380  
##  Mean   : 3.423   Mean   : 2.436   Mean   : 2126   Mean   : 220.781  
##  3rd Qu.: 4.000   3rd Qu.: 3.000   3rd Qu.: 2496   3rd Qu.: 254.131  
##  Max.   :18.000   Max.   :14.000   Max.   :25496   Max.   :5379.236  
##     LotArea         MarketEstimate      RentEstimate       Latitude    
##  Min.   :  0.0000   Min.   :   15700   Min.   :   155   Min.   :25.45  
##  1st Qu.:  0.1774   1st Qu.:  234000   1st Qu.:  1694   1st Qu.:36.10  
##  Median :  0.2900   Median :  345400   Median :  2199   Median :39.96  
##  Mean   :  2.0545   Mean   :  487722   Mean   :  2658   Mean   :40.04  
##  3rd Qu.:  0.9000   3rd Qu.:  503600   3rd Qu.:  2875   3rd Qu.:42.98  
##  Max.   :700.0000   Max.   :36876900   Max.   :176194   Max.   :65.04  
##    Longitude        ListedPrice      
##  Min.   :-161.77   Min.   :   14900  
##  1st Qu.:-103.74   1st Qu.:  239000  
##  Median : -89.80   Median :  349900  
##  Mean   : -92.91   Mean   :  499206  
##  3rd Qu.: -79.00   3rd Qu.:  500000  
##  Max.   : -67.02   Max.   :40000000
# Getting the count of listings in each state (frequency table)
count_State <- table(Zillow$State)
# Converting the raw counts to proportions of the full dataset
count_State_prop <- prop.table(count_State)
count_State
## 
##  AK  AL  AR  AZ  CA  CO  CT  DE  FL  GA  IA  ID  IL  IN  KS  KY  LA  MA  MD  ME 
## 441 216 299 144 431 297 314 289 276 339 352 303 187 266 362 312 113 420 371 224 
##  MI  MN  MO  MS  MT  NC  ND  NE  NH  NJ  NM  NV  NY  OH  OK  OR  PA  RI  SC  SD 
## 407 293 398 386 283 331 370 289 280 232 271 287 190 140 280 164 369 253 355 351 
##  TN  TX  UT  VA  VT  WA  WI  WV  WY 
## 268 373 394 364 304 325 341 255 344
count_State_prop
## 
##          AK          AL          AR          AZ          CA          CO 
## 0.029690972 0.014542517 0.020130613 0.009695011 0.029017707 0.019995960 
##          CT          DE          FL          GA          IA          ID 
## 0.021140510 0.019457349 0.018582105 0.022823672 0.023698916 0.020399919 
##          IL          IN          KS          KY          LA          MA 
## 0.012590049 0.017908840 0.024372181 0.021005857 0.007607891 0.028277116 
##          MD          ME          MI          MN          MO          MS 
## 0.024978119 0.015081128 0.027401872 0.019726655 0.026795933 0.025988016 
##          MT          NC          ND          NE          NH          NJ 
## 0.019053390 0.022285060 0.024910792 0.019457349 0.018851410 0.015619740 
##          NM          NV          NY          OH          OK          OR 
## 0.018245472 0.019322696 0.012792029 0.009425705 0.018851410 0.011041540 
##          PA          RI          SC          SD          TN          TX 
## 0.024843466 0.017033596 0.023900895 0.023631590 0.018043493 0.025112772 
##          UT          VA          VT          WA          WI          WV 
## 0.026526628 0.024506834 0.020467246 0.021881101 0.022958325 0.017168249 
##          WY 
## 0.023160304

Data Visualisation

# Bar chart of the number of listings per state
barplot(
  count_State,
  main = "State Count",
  xlab = "State",
  ylab = "Count",
  col = "steelblue",
  cex.names = 0.8,
  cex.axis = 0.8,
  width = 0.5
)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Function to remove outliers based on IQR
# Remove rows whose value in `column` lies outside the Tukey fences
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
#
# Args:
#   df     - a data.frame / tibble
#   column - name of a numeric column, given as a string
# Returns: df restricted to rows inside the fences. Rows where `column`
#   is NA are dropped, matching dplyr::filter() in the original version.
remove_outliers <- function(df, column) {
  qs <- quantile(df[[column]], c(0.25, 0.75), na.rm = TRUE)
  # Avoid naming this local `IQR`: that shadows stats::IQR()
  iqr <- qs[[2]] - qs[[1]]
  lower_bound <- qs[[1]] - 1.5 * iqr
  upper_bound <- qs[[2]] + 1.5 * iqr
  # which() discards NA comparisons, exactly like dplyr::filter();
  # base subsetting removes the dplyr dependency from this helper
  keep <- which(df[[column]] >= lower_bound & df[[column]] <= upper_bound)
  df[keep, , drop = FALSE]
}

# Remove ListedPrice outliers separately within each state, so that
# high-price states are not trimmed against a nationwide IQR
Zillow_no_outliers <- Zillow %>%
  group_by(State) %>%
  group_modify(~remove_outliers(.x, "ListedPrice")) %>%
  ungroup()

# Plot the data without outliers: one box per state, x labels rotated
# 90 degrees so all 49 state abbreviations stay readable
ggplot(Zillow_no_outliers, aes(x = State, y = ListedPrice)) +
  geom_boxplot() +
  labs(title = "Box Plot of Listed Prices by State", x = "State", y = "Listed Price") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Distribution of Property ListedPrices
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ purrr   1.0.1
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
ggplot(Zillow_no_outliers, aes(x = ListedPrice)) +
  geom_histogram(binwidth = 50000, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Property ListedPrices", x = "ListedPrice", y = "Count")

# ListedPrice vs. Area
ggplot(Zillow_no_outliers, aes(x = Area, y = ListedPrice)) +
  geom_point(alpha = 0.6) +
  labs(title = "ListedPrice vs. Area", x = "Area (sqft)", y = "ListedPrice")

# Box Plot of ListedPrices by Stat
ggplot(Zillow_no_outliers, aes(x = State, y = ListedPrice)) +
  geom_boxplot() +
  labs(title = "Box Plot of ListedPrices by State", x = "State", y = "ListedPrice") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Market Estimate vs. Rent Estimate
ggplot(Zillow_no_outliers, aes(x = MarketEstimate, y = RentEstimate)) +
  geom_point(alpha = 0.6) +
  labs(title = "Market Estimate vs. Rent Estimate", x = "Market Estimate", y = "Rent Estimate")

# "Distribution of ListedPrice up to $1,000,000"


# Subset the data where ListedPrice is less than or equal to $1,000,000
sub_df <- subset(Zillow_no_outliers, ListedPrice <= 1000000)

# Create the histogram with a kernel-density overlay.
# Fix: `..density..` was deprecated in ggplot2 3.4.0 (the old code
# emitted a warning saying so); `after_stat(density)` is the
# replacement. The y axis shows density, not raw counts, so the label
# is corrected from "Frequency" to "Density".
ggplot(sub_df, aes(x = ListedPrice)) +
  geom_histogram(bins = 30, aes(y = after_stat(density)), fill = "lightblue", color = "black") +
  geom_density(alpha = 0.2, fill = "orange") +
  labs(title = "Distribution of ListedPrice up to $1,000,000",
       x = "ListedPrice",
       y = "Density") +
  theme_minimal()

Statistical Data

# Summary stats
summary(Zillow_no_outliers)
##     State               City              Street             Zipcode     
##  Length:13924       Length:13924       Length:13924       Min.   : 1002  
##  Class :character   Class :character   Class :character   1st Qu.:25433  
##  Mode  :character   Mode  :character   Mode  :character   Median :52240  
##                                                           Mean   :50264  
##                                                           3rd Qu.:75224  
##                                                           Max.   :99950  
##     Bedroom          Bathroom           Area            PPSq         
##  Min.   : 0.000   Min.   : 0.000   Min.   :  240   Min.   :   5.883  
##  1st Qu.: 3.000   1st Qu.: 2.000   1st Qu.: 1400   1st Qu.: 132.139  
##  Median : 3.000   Median : 2.000   Median : 1824   Median : 178.782  
##  Mean   : 3.356   Mean   : 2.305   Mean   : 1969   Mean   : 201.271  
##  3rd Qu.: 4.000   3rd Qu.: 3.000   3rd Qu.: 2371   3rd Qu.: 242.977  
##  Max.   :13.000   Max.   :11.000   Max.   :25496   Max.   :1598.639  
##     LotArea        MarketEstimate     RentEstimate      Latitude    
##  Min.   :  0.000   Min.   :  15700   Min.   :  155   Min.   :25.45  
##  1st Qu.:  0.172   1st Qu.: 226575   1st Qu.: 1650   1st Qu.:36.10  
##  Median :  0.280   Median : 329600   Median : 2113   Median :39.95  
##  Mean   :  1.546   Mean   : 370193   Mean   : 2264   Mean   :40.04  
##  3rd Qu.:  0.780   3rd Qu.: 463025   3rd Qu.: 2700   3rd Qu.:42.96  
##  Max.   :248.000   Max.   :2491400   Max.   :19726   Max.   :65.04  
##    Longitude        ListedPrice     
##  Min.   :-161.77   Min.   :  14900  
##  1st Qu.:-103.63   1st Qu.: 229900  
##  Median : -89.67   Median : 332500  
##  Mean   : -92.86   Mean   : 372886  
##  3rd Qu.: -79.07   3rd Qu.: 465000  
##  Max.   : -67.02   Max.   :2399999
summary_stats <- Zillow_no_outliers %>%
  summarise(
    avg_ListedPrice = mean(ListedPrice, na.rm = TRUE),
    med_ListedPrice = median(ListedPrice, na.rm = TRUE),
    avg_rent = mean(RentEstimate, na.rm = TRUE),
    med_rent = median(RentEstimate, na.rm = TRUE),
    avg_area = mean(Area, na.rm = TRUE),
    med_area = median(Area, na.rm = TRUE)
  )
print(summary_stats)
## # A tibble: 1 × 6
##   avg_ListedPrice med_ListedPrice avg_rent med_rent avg_area med_area
##             <dbl>           <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
## 1         372886.          332500    2264.     2113    1969.     1824
# Correlation of every numeric column with the dependent variable
# ListedPrice; non-numeric columns get NA.
# Fix: the original subset only ListedPrice by complete.cases() but not
# `col`, which would raise a length-mismatch error in cor() whenever a
# column contained NAs. cor(..., use = "complete.obs") handles NAs
# correctly and is equivalent when (as here, post na.omit) none exist.
# vapply() is used instead of sapply() for a guaranteed numeric result.
corr <- vapply(
  Zillow_no_outliers,
  function(col) {
    if (is.numeric(col)) {
      cor(Zillow_no_outliers$ListedPrice, col, use = "complete.obs")
    } else {
      NA_real_
    }
  },
  numeric(1)
)

corr
##          State           City         Street        Zipcode        Bedroom 
##             NA             NA             NA     0.09933569     0.27866599 
##       Bathroom           Area           PPSq        LotArea MarketEstimate 
##     0.43755459     0.42444607     0.66103006     0.05139465     0.99227276 
##   RentEstimate       Latitude      Longitude    ListedPrice 
##     0.74962393     0.05395883    -0.18135602     1.00000000
# 3. Correlation Analysis
library(corrplot)
## corrplot 0.92 loaded
cor_matrix <- cor(Zillow_no_outliers[, c("Bedroom", "Bathroom", "Area", "PPSq", "LotArea", "ListedPrice")], use = "complete.obs")
corrplot(cor_matrix, method = "circle")

# Load necessary libraries
#install.packages("shiny")
#install.packages("plotly")
library(shiny)
## Warning: package 'shiny' was built under R version 4.2.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
library(ggplot2)
#install.packages("leaflet")
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.2.3
library(corrplot)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
#install.packages("forecast")
#library(forecast)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# 2. Geospatial Analysis: Map the distribution of properties
# Define color palette based on property ListedPrices
# NOTE(review): leaflet::colorNumeric() only uses the *range* of
# `domain`, so the 500000 midpoint has no effect on the mapping --
# confirm whether a stepped palette (colorBin with explicit breaks)
# was intended instead.
pal <- colorNumeric(
  palette = c("grey", "blue"),
  domain = c(min(Zillow_no_outliers$ListedPrice), 500000, max(Zillow_no_outliers$ListedPrice)),
  na.color = "transparent"
)

# Create Leaflet map
leaflet(Zillow_no_outliers) %>%
  addTiles() %>%
  addCircleMarkers(
    ~Longitude, ~Latitude,
    radius = ~sqrt(Area) / 10,
    color = ~pal(ListedPrice),
    fillOpacity = 0.7,
    popup = ~paste(
      "<strong>State:</strong>", State, "<br>",
      "<strong>City:</strong>", City, "<br>",
      "<strong>ListedPrice:</strong>", scales::dollar(ListedPrice, prefix = "$")
    )
  ) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~ListedPrice,
    title = "Property ListedPrice",
    labFormat = labelFormat(prefix = "$"),
    opacity = 1
  )

Interactive dashboard

# Combining City and State information
Zillow_no_outliers <- Zillow_no_outliers %>%
  mutate(City_State = paste(City, State, sep = ", "))

# Defining UI for the dashboard
# Defining UI for the dashboard.
# Layout: a sidebar with a short description, and a main panel with two
# tabs ("States" / "Cities"), each holding two plotly bar charts whose
# output IDs are filled in by the server function below.
ui <- fluidPage(
  titlePanel("Real Estate Analysis Dashboard"),
  sidebarLayout(
    sidebarPanel(
      h3("Interactive Dashboard"),
      p("This dashboard displays the top 10 most expensive and cheapest states and cities based on median property ListedPrices.")
    ),
    mainPanel(
      tabsetPanel(
        tabPanel("States",
          plotlyOutput("expensive_states_plot"),
          plotlyOutput("cheap_states_plot")
        ),
        tabPanel("Cities",
          plotlyOutput("expensive_cities_plot"),
          plotlyOutput("cheap_cities_plot")
        )
      )
    )
  )
)

# Defining server logic for the dashboard
# Defining server logic for the dashboard.
#
# The four "top 10" tables and the four bar charts differ only in the
# grouping column, the ranking direction, and the labels, so the
# repeated code is factored into two helpers. Output is unchanged.
server <- function(input, output) {

  # Median ListedPrice per level of `group_col` ("State" or "City_State"),
  # keeping the 10 highest (cheapest = FALSE) or 10 lowest (cheapest = TRUE).
  top10_median <- function(group_col, cheapest = FALSE) {
    medians <- Zillow_no_outliers %>%
      group_by(.data[[group_col]]) %>%
      summarise(median_ListedPrice = median(ListedPrice, na.rm = TRUE))
    if (cheapest) {
      slice_min(medians, median_ListedPrice, n = 10)
    } else {
      slice_max(medians, median_ListedPrice, n = 10)
    }
  }

  # Interactive bar chart of median_ListedPrice by `group_col`.
  # Expensive charts sort descending with a gray->blue fill; cheap charts
  # sort ascending with a blue->gray fill (mirrors the original plots).
  top10_plot <- function(data, group_col, title, x_lab, cheapest = FALSE) {
    sort_sign <- if (cheapest) 1 else -1
    fill_cols <- if (cheapest) c("blue", "gray") else c("gray", "blue")
    p <- ggplot(data, aes(x = reorder(.data[[group_col]], sort_sign * median_ListedPrice),
                          y = median_ListedPrice, fill = median_ListedPrice)) +
      geom_bar(stat = "identity") +
      scale_fill_gradient(low = fill_cols[1], high = fill_cols[2]) +
      theme_minimal() +
      labs(title = title, x = x_lab, y = "Median ListedPrice") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    ggplotly(p)
  }

  # Top 10 most expensive states
  output$expensive_states_plot <- renderPlotly({
    top10_plot(top10_median("State"), "State",
               "Top 10 Most Expensive States", "State")
  })

  # Top 10 cheapest states
  output$cheap_states_plot <- renderPlotly({
    top10_plot(top10_median("State", cheapest = TRUE), "State",
               "Top 10 Cheapest States", "State", cheapest = TRUE)
  })

  # Top 10 most expensive cities
  output$expensive_cities_plot <- renderPlotly({
    top10_plot(top10_median("City_State"), "City_State",
               "Top 10 Most Expensive Cities", "City, State")
  })

  # Top 10 cheapest cities
  output$cheap_cities_plot <- renderPlotly({
    top10_plot(top10_median("City_State", cheapest = TRUE), "City_State",
               "Top 10 Cheapest Cities", "City, State", cheapest = TRUE)
  })
}

# Running the application 
shinyApp(ui = ui, server = server)
## PhantomJS not found. You can install it with webshot::install_phantomjs(). If it is installed, please make sure the phantomjs executable can be found via the PATH variable.
Shiny applications not supported in static R Markdown documents

Splitting the Dataset

library(caret)
# Setting seed for reproducibility
set.seed(123)

# Splitting the data into training and testing sets (70% training, 30% testing)
split_index <- createDataPartition(Zillow_no_outliers$ListedPrice, p = 0.7, list = FALSE)
training_set <- Zillow_no_outliers[split_index, ]
testing_set <- Zillow_no_outliers[-split_index, ]
# Printing the number of rows in each set to verify the split
cat("Number of rows in the training set: ", nrow(training_set), "\n")
## Number of rows in the training set:  9748
cat("Number of rows in the testing set: ", nrow(testing_set), "\n")
## Number of rows in the testing set:  4176
#Viewing the training set 
summary(training_set)
##     State               City              Street             Zipcode     
##  Length:9748        Length:9748        Length:9748        Min.   : 1002  
##  Class :character   Class :character   Class :character   1st Qu.:25425  
##  Mode  :character   Mode  :character   Mode  :character   Median :50672  
##                                                           Mean   :50050  
##                                                           3rd Qu.:75120  
##                                                           Max.   :99950  
##     Bedroom          Bathroom           Area            PPSq         
##  Min.   : 0.000   Min.   : 0.000   Min.   :  240   Min.   :   5.883  
##  1st Qu.: 3.000   1st Qu.: 2.000   1st Qu.: 1398   1st Qu.: 131.994  
##  Median : 3.000   Median : 2.000   Median : 1816   Median : 179.403  
##  Mean   : 3.356   Mean   : 2.308   Mean   : 1970   Mean   : 200.507  
##  3rd Qu.: 4.000   3rd Qu.: 3.000   3rd Qu.: 2370   3rd Qu.: 243.286  
##  Max.   :13.000   Max.   :11.000   Max.   :25496   Max.   :1475.387  
##     LotArea         MarketEstimate     RentEstimate      Latitude    
##  Min.   :  0.0000   Min.   :  21400   Min.   :  155   Min.   :25.96  
##  1st Qu.:  0.1720   1st Qu.: 226475   1st Qu.: 1650   1st Qu.:36.09  
##  Median :  0.2800   Median : 329600   Median : 2101   Median :39.92  
##  Mean   :  1.5414   Mean   : 370054   Mean   : 2265   Mean   :40.00  
##  3rd Qu.:  0.7729   3rd Qu.: 462250   3rd Qu.: 2710   3rd Qu.:42.94  
##  Max.   :248.0000   Max.   :2491400   Max.   :15000   Max.   :65.04  
##    Longitude        ListedPrice       City_State       
##  Min.   :-154.49   Min.   :  20000   Length:9748       
##  1st Qu.:-103.51   1st Qu.: 229900   Class :character  
##  Median : -89.46   Median : 332500   Mode  :character  
##  Mean   : -92.70   Mean   : 372817                     
##  3rd Qu.: -79.00   3rd Qu.: 465000                     
##  Max.   : -67.02   Max.   :2399999
head(training_set)
## # A tibble: 6 × 15
##   State City  Street Zipcode Bedroom Bathr…¹  Area  PPSq LotArea Marke…² RentE…³
##   <chr> <chr> <chr>    <dbl>   <dbl>   <dbl> <dbl> <dbl>   <dbl>   <dbl>   <dbl>
## 1 AK    Wasi… Taffy…   99654       5       2  2409  232.    1.02  547000    2996
## 2 AK    Anch… Sandy…   99507       4       3  2153  221.    0.16  479900    3049
## 3 AK    Anch… E 4th…   99504       1       1   500  258     0.57  298400    1571
## 4 AK    Anch… Georg…   99515       4       3  3031  223.    0.4   678600    3499
## 5 AK    Anch… W 46t…   99517       4       3  2022  240.    0.16  485900    2999
## 6 AK    Fair… Septe…   99709       2       2  1450  324.    2.8   465000    1917
## # … with 4 more variables: Latitude <dbl>, Longitude <dbl>, ListedPrice <dbl>,
## #   City_State <chr>, and abbreviated variable names ¹​Bathroom,
## #   ²​MarketEstimate, ³​RentEstimate
# Dropping identifier columns that should not enter the models.
# NOTE(review): "PredictedListedPrice" does not exist in the data at
# this point, so that entry is a harmless no-op -- confirm intent.
columns_to_drop <- c("City_State", "State", "City", "Street", "PredictedListedPrice")
train_set <- training_set[, !(names(training_set) %in% columns_to_drop)]
test_set <- testing_set[, !(names(testing_set) %in% columns_to_drop)]
# Converting predictors to matrix form (used later by glmnet)
# NOTE(review): the matrices are built BEFORE the as.numeric() coercion
# below; per str(Zillow) all remaining columns are already numeric, so
# the ordering happens to be safe here -- verify if columns change.
x_train <- as.matrix(train_set[, -which(names(train_set) == "ListedPrice")])
x_test <- as.matrix(test_set[, -which(names(test_set) == "ListedPrice")])
# Ensuring all predictor variables are numeric 
train_set[] <- lapply(train_set, as.numeric)
test_set[] <- lapply(test_set, as.numeric)
# Performing linear regression
lm_model1 <- lm(ListedPrice ~ Bedroom + Bathroom + Area + LotArea  + PPSq , data = train_set)

# Summary of the model 1
summary(lm_model1)
## 
## Call:
## lm(formula = ListedPrice ~ Bedroom + Bathroom + Area + LotArea + 
##     PPSq, data = train_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3009694   -29092     -231    29841  1047284 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.985e+05  4.305e+03 -69.328  < 2e-16 ***
## Bedroom      1.738e+04  1.298e+03  13.391  < 2e-16 ***
## Bathroom     3.658e+04  1.539e+03  23.768  < 2e-16 ***
## Area         1.147e+02  1.591e+00  72.093  < 2e-16 ***
## LotArea      4.177e+02  1.463e+02   2.855  0.00432 ** 
## PPSq         1.506e+03  8.886e+00 169.461  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 93920 on 9742 degrees of freedom
## Multiple R-squared:  0.8072, Adjusted R-squared:  0.8071 
## F-statistic:  8159 on 5 and 9742 DF,  p-value: < 2.2e-16
# Performing linear regression
lm_model2 <- lm(ListedPrice ~ MarketEstimate , data = train_set)

# Summary of the model 2
summary(lm_model2)
## 
## Call:
## lm(formula = ListedPrice ~ MarketEstimate, data = train_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1280704    -5891    -1820     4507   368246 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.754e+03  5.677e+02   17.18   <2e-16 ***
## MarketEstimate 9.811e-01  1.325e-03  740.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28260 on 9746 degrees of freedom
## Multiple R-squared:  0.9825, Adjusted R-squared:  0.9825 
## F-statistic: 5.485e+05 on 1 and 9746 DF,  p-value: < 2.2e-16
# Performing linear regression
lm_model3 <- lm(ListedPrice ~ Bedroom + Bathroom + Area + LotArea + RentEstimate + Latitude + Longitude , data = train_set)

# Summary of the model 3
summary(lm_model3)
## 
## Call:
## lm(formula = ListedPrice ~ Bedroom + Bathroom + Area + LotArea + 
##     RentEstimate + Latitude + Longitude, data = train_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1720925   -64580    -7369    55300  1450723 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.778e+05  1.094e+04 -16.248  < 2e-16 ***
## Bedroom      -1.365e+04  1.770e+03  -7.714 1.34e-14 ***
## Bathroom      2.128e+04  2.154e+03   9.879  < 2e-16 ***
## Area          9.788e+00  2.178e+00   4.493 7.10e-06 ***
## LotArea       9.242e+02  2.000e+02   4.621 3.88e-06 ***
## RentEstimate  1.740e+02  1.721e+00 101.096  < 2e-16 ***
## Latitude     -2.631e+03  2.431e+02 -10.824  < 2e-16 ***
## Longitude    -2.564e+03  8.207e+01 -31.242  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 128200 on 9740 degrees of freedom
## Multiple R-squared:  0.6409, Adjusted R-squared:  0.6406 
## F-statistic:  2483 on 7 and 9740 DF,  p-value: < 2.2e-16
# Performing linear regression
lm_model4 <- lm(ListedPrice ~ Bedroom + Bathroom + Area + LotArea + PPSq , data = train_set)

# Summary of the model 4
summary(lm_model4)
## 
## Call:
## lm(formula = ListedPrice ~ Bedroom + Bathroom + Area + LotArea + 
##     PPSq, data = train_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3009694   -29092     -231    29841  1047284 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.985e+05  4.305e+03 -69.328  < 2e-16 ***
## Bedroom      1.738e+04  1.298e+03  13.391  < 2e-16 ***
## Bathroom     3.658e+04  1.539e+03  23.768  < 2e-16 ***
## Area         1.147e+02  1.591e+00  72.093  < 2e-16 ***
## LotArea      4.177e+02  1.463e+02   2.855  0.00432 ** 
## PPSq         1.506e+03  8.886e+00 169.461  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 93920 on 9742 degrees of freedom
## Multiple R-squared:  0.8072, Adjusted R-squared:  0.8071 
## F-statistic:  8159 on 5 and 9742 DF,  p-value: < 2.2e-16
# Performing linear regression
lm_model5 <- lm(ListedPrice ~ Bedroom + Bathroom + Area + LotArea + PPSq + RentEstimate  , data = train_set)

# Summary of the model 5
summary(lm_model5)
## 
## Call:
## lm(formula = ListedPrice ~ Bedroom + Bathroom + Area + LotArea + 
##     PPSq + RentEstimate, data = train_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2230042   -29599     1790    32145   727335 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.800e+05  3.819e+03 -73.313  < 2e-16 ***
## Bedroom       8.262e+03  1.160e+03   7.124 1.12e-12 ***
## Bathroom      2.358e+04  1.382e+03  17.062  < 2e-16 ***
## Area          8.413e+01  1.522e+00  55.280  < 2e-16 ***
## LotArea       3.801e+02  1.293e+02   2.941  0.00328 ** 
## PPSq          1.213e+03  9.634e+00 125.933  < 2e-16 ***
## RentEstimate  7.112e+01  1.358e+00  52.388  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 82960 on 9741 degrees of freedom
## Multiple R-squared:  0.8496, Adjusted R-squared:  0.8495 
## F-statistic:  9171 on 6 and 9741 DF,  p-value: < 2.2e-16
# Table comparing the fit of the 5 models.
# Fix: models 3-5 used `$adj.r.square`, which only worked through R's
# partial `$` name matching; the component is spelled out as
# `$adj.r.squared`. The repeated summary() calls are collapsed into
# vapply() over a list of the models.
models <- list(lm_model1, lm_model2, lm_model3, lm_model4, lm_model5)
r_squared <- vapply(models, function(m) summary(m)$r.squared, numeric(1))
Adj_r_squared <- vapply(models, function(m) summary(m)$adj.r.squared, numeric(1))
model_names <- c("model1", "model2", "model3", "model4", "model5")
r_squared_table <- data.frame(Model = model_names, R_Squared = r_squared, Adjusted_R = Adj_r_squared)
r_squared_table
##    Model R_Squared Adjusted_R
## 1 model1 0.8072263  0.8071274
## 2 model2 0.9825414  0.9825396
## 3 model3 0.6408913  0.6406332
## 4 model4 0.8072263  0.8071274
## 5 model5 0.8496014  0.8495088

Visualizing the best performing linear model

plot(lm_model2)

Now we are testing the model on the new unseen data

Test_Predictions <- predict(lm_model2, newdata = test_set)
head(Test_Predictions)
##        1        2        3        4        5        6 
## 360990.5 295452.6 294667.7 190768.5 235310.8 623927.0

Calculating the RMSE for model 2

summary_lm <- summary(lm_model2)
rmse_lmmodel2 <- summary_lm$sigma
rmse_lmmodel2
## [1] 28258.53
# Visualising the actual and predicted prices

# Combine actual prices and predicted prices into a data frame
results <- data.frame(Actual = test_set$ListedPrice, Predicted = Test_Predictions)

# Plotting actual vs predicted prices
ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +  # 45-degree line for reference
  labs(x = "Actual Price", y = "Predicted Price", title = "Actual vs Predicted Prices")

Lasso Model

# Loading necessary libraries 
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
# Training the Lasso regression model on the training set
lasso_model <- cv.glmnet(x_train, train_set$ListedPrice, alpha = 1)

# Printing the best lambda value chosen by cross-validation
print(paste("Best lambda value:", lasso_model$lambda.min))
## [1] "Best lambda value: 603.699718995695"
# Score the fitted lasso on the held-out test matrix at the best lambda.
lasso_predictions <- predict(lasso_model, newx = x_test, s = "lambda.min")

# Root-mean-squared error of the lasso predictions on the test set.
lasso_errors <- lasso_predictions - test_set$ListedPrice
rmse_lasso <- sqrt(mean(lasso_errors^2))
print(paste("Lasso Regression RMSE on Testing Set:", rmse_lasso))
## [1] "Lasso Regression RMSE on Testing Set: 21925.9505077498"
# Base-R scatter of lasso predictions against the actual prices.
plot(x = test_set$ListedPrice, y = lasso_predictions,
     col = "blue", pch = 16,
     main = "Actual vs. Predicted ListedPrice",
     xlab = "Actual ListedPrice",
     ylab = "Predicted ListedPrice")

# Dashed red identity line: points on it are perfectly predicted.
abline(a = 0, b = 1, col = "red", lty = 2)

# Key mapping the point and reference-line symbols.
legend("topleft", legend = c("Data Points", "Ideal Prediction"),
       col = c("blue", "red"), pch = c(16, NA), lty = c(NA, 2))

Decision Trees

library(rpart)
library(rpart.plot)
# Fit a regression tree for ListedPrice at a given minsplit (the minimum
# number of observations a node must contain before a split is attempted).
# Consolidates four copy-pasted rpart calls into one helper (DRY); every
# object name exported by the original code is preserved for later blocks.
fit_price_tree <- function(minsplit) {
  rpart(ListedPrice ~ ., data = train_set, method = 'anova',
        control = rpart.control(minsplit = minsplit))
}

# Test-set RMSE of a vector of tree predictions.
tree_rmse <- function(pred) {
  sqrt(mean((pred - test_set$ListedPrice)^2))
}

# Training one tree per candidate minsplit value
model_minsplit_30 <- fit_price_tree(30)
model_minsplit_50 <- fit_price_tree(50)
model_minsplit_60 <- fit_price_tree(60)
model_minsplit_90 <- fit_price_tree(90)

# Evaluating the models on the held-out test set
pred_minsplit_30 <- predict(model_minsplit_30, test_set)
pred_minsplit_50 <- predict(model_minsplit_50, test_set)
pred_minsplit_60 <- predict(model_minsplit_60, test_set)
pred_minsplit_90 <- predict(model_minsplit_90, test_set)

# Calculating performance metric (RMSE)
rmse_minsplit_30 <- tree_rmse(pred_minsplit_30)
rmse_minsplit_50 <- tree_rmse(pred_minsplit_50)
rmse_minsplit_60 <- tree_rmse(pred_minsplit_60)
rmse_minsplit_90 <- tree_rmse(pred_minsplit_90)

# Comparing RMSE values
rmse_values <- c(minsplit_30 = rmse_minsplit_30, minsplit_50 = rmse_minsplit_50, minsplit_60 = rmse_minsplit_60, minsplit_90 = rmse_minsplit_90)
print(rmse_values)
## minsplit_30 minsplit_50 minsplit_60 minsplit_90 
##    45948.85    45948.85    45948.85    53621.69

After analyzing the results, we can say that minsplit values of 30, 50 and 60 perform equally well (RMSE 45948.85), each outperforming minsplit = 90.

# Visualising the best performing model (minsplit = 50); with extra = 1
# each node also shows the number of observations it contains.
rpart.plot(model_minsplit_50, extra = 1, type = 0, box.palette = c("lightblue", "grey"), under = TRUE)

# Plot actual vs. predicted values for the minsplit = 50 tree
plot(test_set$ListedPrice, pred_minsplit_50,
     main = "Actual vs. Predicted ListedPrice",
     xlab = "Actual ListedPrice",
     ylab = "Predicted ListedPrice",
     col = "blue", pch = 16)

# Dashed red identity line: points on it are perfectly predicted
abline(0, 1, col = "red", lty = 2)

legend("topleft", legend = c("Data Points", "Ideal Prediction"),
       col = c("blue", "red"), pch = c(16, NA), lty = c(NA, 2))

Ridge Model

# Fit a cross-validated ridge regression (alpha = 0 selects the L2 penalty).
# NOTE(review): cv.glmnet uses random fold assignment; a set.seed() call is
# not visible here, so the chosen lambda may vary between runs — confirm.
ridge_model <- cv.glmnet(x_train, train_set$ListedPrice, alpha = 0)

# Report the penalty that minimised the cross-validated error.
best_ridge_lambda <- ridge_model$lambda.min
print(paste("Best lambda value:", best_ridge_lambda))
## [1] "Best lambda value: 21197.0546328568"

# Score the fitted ridge model on the held-out test matrix.
ridge_predictions <- predict(ridge_model, newx = x_test, s = "lambda.min")

# Root-mean-squared error of the ridge predictions on the test set.
ridge_errors <- ridge_predictions - test_set$ListedPrice
rmse_ridge <- sqrt(mean(ridge_errors^2))
print(paste("Ridge Regression RMSE on Testing Set:", rmse_ridge))
## [1] "Ridge Regression RMSE on Testing Set: 36016.1236291789"

# Base-R scatter of ridge predictions against the actual prices.
plot(x = test_set$ListedPrice, y = ridge_predictions,
     col = "blue", pch = 16,
     main = "Actual vs. Predicted ListedPrice",
     xlab = "Actual ListedPrice",
     ylab = "Predicted ListedPrice")

# Dashed red identity line: points on it are perfectly predicted.
abline(a = 0, b = 1, col = "red", lty = 2)

legend("topleft", legend = c("Data Points", "Ideal Prediction"),
       col = c("blue", "red"), pch = c(16, NA), lty = c(NA, 2))

Creating a table of all RMSE values

# Test-set RMSE of the best linear model, for the comparison table.
# FIX: the held-out data frame is named `test_set` throughout this document;
# `testing_set` is undefined and this line would error at run time.
rmse_lmmodel2 <- sqrt(mean((Test_Predictions - test_set$ListedPrice)^2))

# Assemble one row per model with its test-set RMSE.
model_names <- c("Lasso", "Decision Tree", "Ridge", "Linear Model")
rmse_values <- c(rmse_lasso, rmse_minsplit_50, rmse_ridge, rmse_lmmodel2)
rmse_table <- data.frame(Model = model_names, RMSE = rmse_values)

# Formatting RMSE values for better readability
# (note: sprintf turns the column into character — fine for display only).
rmse_table$RMSE <- sprintf("%.2f", rmse_table$RMSE)

# Printing the RMSE table
print(rmse_table)
##           Model     RMSE
## 1         Lasso 21925.95
## 2 Decision Tree 45948.85
## 3         Ridge 36016.12
## 4  Linear Model 22724.23